import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Use a raw string for the Windows path: the original mixed escaped '\\'
# with bare backslashes ('\O', '\D'), which only works because those are
# invalid escape sequences Python currently leaves as-is (DeprecationWarning,
# slated to become a SyntaxError).
os.chdir(r'C:\Users\lance\OneDrive\Desktop\Feynn Labs')
# Campus placement dataset: 215 rows x 15 columns, salary NaN for unplaced rows
df = pd.read_csv('Placement_Data_Full_Class.csv')
df
| sl_no | gender | ssc_p | ssc_b | hsc_p | hsc_b | hsc_s | degree_p | degree_t | workex | etest_p | specialisation | mba_p | status | salary | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | M | 67.00 | Others | 91.00 | Others | Commerce | 58.00 | Sci&Tech | No | 55.0 | Mkt&HR | 58.80 | Placed | 270000.0 |
| 1 | 2 | M | 79.33 | Central | 78.33 | Others | Science | 77.48 | Sci&Tech | Yes | 86.5 | Mkt&Fin | 66.28 | Placed | 200000.0 |
| 2 | 3 | M | 65.00 | Central | 68.00 | Central | Arts | 64.00 | Comm&Mgmt | No | 75.0 | Mkt&Fin | 57.80 | Placed | 250000.0 |
| 3 | 4 | M | 56.00 | Central | 52.00 | Central | Science | 52.00 | Sci&Tech | No | 66.0 | Mkt&HR | 59.43 | Not Placed | NaN |
| 4 | 5 | M | 85.80 | Central | 73.60 | Central | Commerce | 73.30 | Comm&Mgmt | No | 96.8 | Mkt&Fin | 55.50 | Placed | 425000.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 210 | 211 | M | 80.60 | Others | 82.00 | Others | Commerce | 77.60 | Comm&Mgmt | No | 91.0 | Mkt&Fin | 74.49 | Placed | 400000.0 |
| 211 | 212 | M | 58.00 | Others | 60.00 | Others | Science | 72.00 | Sci&Tech | No | 74.0 | Mkt&Fin | 53.62 | Placed | 275000.0 |
| 212 | 213 | M | 67.00 | Others | 67.00 | Others | Commerce | 73.00 | Comm&Mgmt | Yes | 59.0 | Mkt&Fin | 69.72 | Placed | 295000.0 |
| 213 | 214 | F | 74.00 | Others | 66.00 | Others | Commerce | 58.00 | Comm&Mgmt | No | 70.0 | Mkt&HR | 60.23 | Placed | 204000.0 |
| 214 | 215 | M | 62.00 | Central | 58.00 | Others | Science | 53.00 | Comm&Mgmt | No | 89.0 | Mkt&HR | 60.22 | Not Placed | NaN |
215 rows × 15 columns
# Summary statistics for the numeric columns (salary: only 148 non-null of 215)
df.describe()
| sl_no | ssc_p | hsc_p | degree_p | etest_p | mba_p | salary | |
|---|---|---|---|---|---|---|---|
| count | 215.000000 | 215.000000 | 215.000000 | 215.000000 | 215.000000 | 215.000000 | 148.000000 |
| mean | 108.000000 | 67.303395 | 66.333163 | 66.370186 | 72.100558 | 62.278186 | 288655.405405 |
| std | 62.209324 | 10.827205 | 10.897509 | 7.358743 | 13.275956 | 5.833385 | 93457.452420 |
| min | 1.000000 | 40.890000 | 37.000000 | 50.000000 | 50.000000 | 51.210000 | 200000.000000 |
| 25% | 54.500000 | 60.600000 | 60.900000 | 61.000000 | 60.000000 | 57.945000 | 240000.000000 |
| 50% | 108.000000 | 67.000000 | 65.000000 | 66.000000 | 71.000000 | 62.000000 | 265000.000000 |
| 75% | 161.500000 | 75.700000 | 73.000000 | 72.000000 | 83.500000 | 66.255000 | 300000.000000 |
| max | 215.000000 | 89.400000 | 97.700000 | 91.000000 | 98.000000 | 77.890000 | 940000.000000 |
# Column dtypes and null counts — salary has 67 missing values
# (apparently for 'Not Placed' rows; confirmed below via crosstab logic)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 215 entries, 0 to 214 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sl_no 215 non-null int64 1 gender 215 non-null object 2 ssc_p 215 non-null float64 3 ssc_b 215 non-null object 4 hsc_p 215 non-null float64 5 hsc_b 215 non-null object 6 hsc_s 215 non-null object 7 degree_p 215 non-null float64 8 degree_t 215 non-null object 9 workex 215 non-null object 10 etest_p 215 non-null float64 11 specialisation 215 non-null object 12 mba_p 215 non-null float64 13 status 215 non-null object 14 salary 148 non-null float64 dtypes: float64(6), int64(1), object(8) memory usage: 25.3+ KB
# Salary distribution: density curve plus skewness/kurtosis summary.
plt.figure()
salary_col = df['salary']
salary_col.plot(kind='density')
skew_sal = salary_col.skew()
kurt_sal = salary_col.kurt()
print('skew_sal:{}'.format(skew_sal))
print('kurt_sal:{}'.format(kurt_sal))
skew_sal:3.5697471998711054 kurt_sal:18.54427337222016
# MBA-percentage distribution: density curve plus skewness/kurtosis summary.
plt.figure()
mba_col = df['mba_p']
mba_col.plot(kind='density')
skew_mba_p = mba_col.skew()
kurt_mba_p = mba_col.kurt()
print('skew_mba_p:{}'.format(skew_mba_p))
print('kurt_mba_p:{}'.format(kurt_mba_p))
skew_mba_p:0.31357565317840763 kurt_mba_p:-0.47072269614559437
# Interactive distribution views: salary by specialisation / overall density /
# by gender, and placement-status counts by specialisation.
px.histogram(df,x='salary',color='specialisation')
px.histogram(df,x='salary',histnorm='probability density')
px.histogram(df,x='salary',color='gender')
px.histogram(df,x='status',color='specialisation')
import scipy.stats as stats
# Contingency table of specialisation vs placement status for the chi-square test
df_table=pd.crosstab(df['specialisation'],df['status'])
print(df_table)
status Not Placed Placed specialisation Mkt&Fin 25 95 Mkt&HR 42 53
# Observed cell counts from the contingency table
Observed_Values=df_table.values
# chi2_contingency returns (statistic, p-value, dof, expected frequencies)
val=stats.chi2_contingency(df_table)
val
(12.440229009203623,
0.00042018425858864284,
1,
array([[37.39534884, 82.60465116],
[29.60465116, 65.39534884]]))
# Expected cell counts under independence (index 3 of the chi2_contingency tuple)
Expected_Values=val[3]
Expected_Values
array([[37.39534884, 82.60465116],
[29.60465116, 65.39534884]])
# Degrees of freedom for the chi-square test: (rows - 1) * (cols - 1).
# Use .shape instead of the original hard-coded iloc[0:2, 0] / iloc[0, 0:2]
# slices, so this works for contingency tables of any size, not just 2x2.
no_of_rows, no_of_columns = df_table.shape
ddof = (no_of_rows - 1) * (no_of_columns - 1)
print("Degree of Freedom:-", ddof)
# Significance level for the test
alpha = 0.05
Degree of Freedom:- 1
from scipy.stats import chi2
# Pearson chi-square statistic: sum over all cells of (O - E)^2 / E.
# Vectorised over the whole table, so it generalises to any table shape —
# the original summed exactly two column entries (chi_square[0]+chi_square[1]),
# which only worked for a 2-column table.
chi_square = (Observed_Values - Expected_Values) ** 2.0 / Expected_Values
chi_square_statistic = chi_square.sum()
# Critical value of the chi-square distribution at 1 - alpha
critical_value=chi2.ppf(q=1-alpha,df=ddof)
print('critical_value:',critical_value)
critical_value: 3.841458820694124
#p-value
# Right-tail probability of the observed chi-square statistic
p_value=1-chi2.cdf(x=chi_square_statistic,df=ddof)
print('p-value:',p_value)
print('Significance level: ',alpha)
print('Degree of Freedom: ',ddof)
print('p-value:',p_value)
p-value: 0.0002375467465819403 Significance level: 0.05 Degree of Freedom: 1 p-value: 0.0002375467465819403
# Decision rule, stated twice: once via critical value, once via p-value —
# both comparisons must lead to the same conclusion.
reject_msg = "Reject H0,There is a relationship between 2 categorical variables"
retain_msg = "Retain H0,There is no relationship between 2 categorical variables"
print(reject_msg if chi_square_statistic >= critical_value else retain_msg)
print(reject_msg if p_value <= alpha else retain_msg)
Reject H0,There is a relationship between 2 categorical variables Reject H0,There is a relationship between 2 categorical variables
# Work on a copy so the raw dataframe stays intact
df1 = df.copy()
# Feature matrix: everything except the target column
X = df1.drop(columns='status')
X
| sl_no | gender | ssc_p | ssc_b | hsc_p | hsc_b | hsc_s | degree_p | degree_t | workex | etest_p | specialisation | mba_p | salary | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | M | 67.00 | Others | 91.00 | Others | Commerce | 58.00 | Sci&Tech | No | 55.0 | Mkt&HR | 58.80 | 270000.0 |
| 1 | 2 | M | 79.33 | Central | 78.33 | Others | Science | 77.48 | Sci&Tech | Yes | 86.5 | Mkt&Fin | 66.28 | 200000.0 |
| 2 | 3 | M | 65.00 | Central | 68.00 | Central | Arts | 64.00 | Comm&Mgmt | No | 75.0 | Mkt&Fin | 57.80 | 250000.0 |
| 3 | 4 | M | 56.00 | Central | 52.00 | Central | Science | 52.00 | Sci&Tech | No | 66.0 | Mkt&HR | 59.43 | NaN |
| 4 | 5 | M | 85.80 | Central | 73.60 | Central | Commerce | 73.30 | Comm&Mgmt | No | 96.8 | Mkt&Fin | 55.50 | 425000.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 210 | 211 | M | 80.60 | Others | 82.00 | Others | Commerce | 77.60 | Comm&Mgmt | No | 91.0 | Mkt&Fin | 74.49 | 400000.0 |
| 211 | 212 | M | 58.00 | Others | 60.00 | Others | Science | 72.00 | Sci&Tech | No | 74.0 | Mkt&Fin | 53.62 | 275000.0 |
| 212 | 213 | M | 67.00 | Others | 67.00 | Others | Commerce | 73.00 | Comm&Mgmt | Yes | 59.0 | Mkt&Fin | 69.72 | 295000.0 |
| 213 | 214 | F | 74.00 | Others | 66.00 | Others | Commerce | 58.00 | Comm&Mgmt | No | 70.0 | Mkt&HR | 60.23 | 204000.0 |
| 214 | 215 | M | 62.00 | Central | 58.00 | Others | Science | 53.00 | Comm&Mgmt | No | 89.0 | Mkt&HR | 60.22 | NaN |
215 rows × 14 columns
# Target vector: placement status ('Placed' / 'Not Placed')
y=df1['status']
y.shape
(215,)
# Pairwise scatter plots of numeric features coloured by placement status
sns.pairplot(df1,hue='status',height=3)
<seaborn.axisgrid.PairGrid at 0x18acf08ce80>
# Cardinality of each column (identifies the binary/categorical columns)
df1.nunique()
sl_no 215 gender 2 ssc_p 103 ssc_b 2 hsc_p 97 hsc_b 2 hsc_s 3 degree_p 89 degree_t 3 workex 2 etest_p 100 specialisation 2 mba_p 205 status 2 salary 45 dtype: int64
# Missing-value counts per feature (only salary has NaNs: 67)
X.isna().sum()
sl_no 0 gender 0 ssc_p 0 ssc_b 0 hsc_p 0 hsc_b 0 hsc_s 0 degree_p 0 degree_t 0 workex 0 etest_p 0 specialisation 0 mba_p 0 salary 67 dtype: int64
# The target has no missing values
y.isna().sum()
0
# Box plots of the numeric columns to eyeball outliers
df.plot(kind='box')
<AxesSubplot:>
# Histograms of the numeric feature columns
X.hist(figsize=(15,10))
array([[<AxesSubplot:title={'center':'sl_no'}>,
<AxesSubplot:title={'center':'ssc_p'}>,
<AxesSubplot:title={'center':'hsc_p'}>],
[<AxesSubplot:title={'center':'degree_p'}>,
<AxesSubplot:title={'center':'etest_p'}>,
<AxesSubplot:title={'center':'mba_p'}>],
[<AxesSubplot:title={'center':'salary'}>, <AxesSubplot:>,
<AxesSubplot:>]], dtype=object)
# Class balance of the target
y.hist()
<AxesSubplot:>
#fill missing Values
# BUG FIX: chained `X['salary'].fillna(..., inplace=True)` operates on a column
# selection and can hit pandas' SettingWithCopy path (and chained inplace
# assignment is deprecated); assign the result back to the column instead.
# NOTE(review): salary appears to be NaN exactly for 'Not Placed' rows, so
# median imputation injects placed-student salaries into unplaced rows — this
# likely leaks the target into the features; consider dropping the column.
X['salary'] = X['salary'].fillna(X['salary'].median())
X
| sl_no | gender | ssc_p | ssc_b | hsc_p | hsc_b | hsc_s | degree_p | degree_t | workex | etest_p | specialisation | mba_p | salary | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | M | 67.00 | Others | 91.00 | Others | Commerce | 58.00 | Sci&Tech | No | 55.0 | Mkt&HR | 58.80 | 270000.0 |
| 1 | 2 | M | 79.33 | Central | 78.33 | Others | Science | 77.48 | Sci&Tech | Yes | 86.5 | Mkt&Fin | 66.28 | 200000.0 |
| 2 | 3 | M | 65.00 | Central | 68.00 | Central | Arts | 64.00 | Comm&Mgmt | No | 75.0 | Mkt&Fin | 57.80 | 250000.0 |
| 3 | 4 | M | 56.00 | Central | 52.00 | Central | Science | 52.00 | Sci&Tech | No | 66.0 | Mkt&HR | 59.43 | 265000.0 |
| 4 | 5 | M | 85.80 | Central | 73.60 | Central | Commerce | 73.30 | Comm&Mgmt | No | 96.8 | Mkt&Fin | 55.50 | 425000.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 210 | 211 | M | 80.60 | Others | 82.00 | Others | Commerce | 77.60 | Comm&Mgmt | No | 91.0 | Mkt&Fin | 74.49 | 400000.0 |
| 211 | 212 | M | 58.00 | Others | 60.00 | Others | Science | 72.00 | Sci&Tech | No | 74.0 | Mkt&Fin | 53.62 | 275000.0 |
| 212 | 213 | M | 67.00 | Others | 67.00 | Others | Commerce | 73.00 | Comm&Mgmt | Yes | 59.0 | Mkt&Fin | 69.72 | 295000.0 |
| 213 | 214 | F | 74.00 | Others | 66.00 | Others | Commerce | 58.00 | Comm&Mgmt | No | 70.0 | Mkt&HR | 60.23 | 204000.0 |
| 214 | 215 | M | 62.00 | Central | 58.00 | Others | Science | 53.00 | Comm&Mgmt | No | 89.0 | Mkt&HR | 60.22 | 265000.0 |
215 rows × 14 columns
# #Removing outliers
# import scipy.stats as stats
# #find absolute value of z-score for each observation
# z = np.abs(stats.zscore(X['salary']))
# #only keep rows in dataframe with all z-scores less than absolute value of 3
# z = np.abs(stats.zscore(X['salary']))
# print(z)
# print(np.where(z > 3))
# Encode target column: map the string labels 'Not Placed'/'Placed' to 0/1
from sklearn.preprocessing import LabelEncoder
# Encode for string labels
le =LabelEncoder()
# BUG FIX: the original called `label_encoder.fit_transform(y)`, but the
# instance created above is named `le`, so that line raised NameError.
y = le.fit_transform(y)
y.shape
(215,)
# Encoded target: 0 = 'Not Placed', 1 = 'Placed' (LabelEncoder sorts labels)
y
array([1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1,
0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0])
# Sanity check: encoding kept all 215 rows
y.shape
(215,)
# 14 feature columns before dummy encoding
X.shape
(215, 14)
# Feature matrix with salary imputed, categorical columns still as strings
X
| sl_no | gender | ssc_p | ssc_b | hsc_p | hsc_b | hsc_s | degree_p | degree_t | workex | etest_p | specialisation | mba_p | salary | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | M | 67.00 | Others | 91.00 | Others | Commerce | 58.00 | Sci&Tech | No | 55.0 | Mkt&HR | 58.80 | 270000.0 |
| 1 | 2 | M | 79.33 | Central | 78.33 | Others | Science | 77.48 | Sci&Tech | Yes | 86.5 | Mkt&Fin | 66.28 | 200000.0 |
| 2 | 3 | M | 65.00 | Central | 68.00 | Central | Arts | 64.00 | Comm&Mgmt | No | 75.0 | Mkt&Fin | 57.80 | 250000.0 |
| 3 | 4 | M | 56.00 | Central | 52.00 | Central | Science | 52.00 | Sci&Tech | No | 66.0 | Mkt&HR | 59.43 | 265000.0 |
| 4 | 5 | M | 85.80 | Central | 73.60 | Central | Commerce | 73.30 | Comm&Mgmt | No | 96.8 | Mkt&Fin | 55.50 | 425000.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 210 | 211 | M | 80.60 | Others | 82.00 | Others | Commerce | 77.60 | Comm&Mgmt | No | 91.0 | Mkt&Fin | 74.49 | 400000.0 |
| 211 | 212 | M | 58.00 | Others | 60.00 | Others | Science | 72.00 | Sci&Tech | No | 74.0 | Mkt&Fin | 53.62 | 275000.0 |
| 212 | 213 | M | 67.00 | Others | 67.00 | Others | Commerce | 73.00 | Comm&Mgmt | Yes | 59.0 | Mkt&Fin | 69.72 | 295000.0 |
| 213 | 214 | F | 74.00 | Others | 66.00 | Others | Commerce | 58.00 | Comm&Mgmt | No | 70.0 | Mkt&HR | 60.23 | 204000.0 |
| 214 | 215 | M | 62.00 | Central | 58.00 | Others | Science | 53.00 | Comm&Mgmt | No | 89.0 | Mkt&HR | 60.22 | 265000.0 |
215 rows × 14 columns
#Creating pipeline
#https://towardsdatascience.com/step-by-step-tutorial-of-sci-kit-learn-pipeline-62402d5629b6
# One-hot encode the categorical columns; drop_first=True avoids the dummy trap
X=pd.get_dummies(X,drop_first=True)
X
| sl_no | ssc_p | hsc_p | degree_p | etest_p | mba_p | salary | gender_M | ssc_b_Others | hsc_b_Others | hsc_s_Commerce | hsc_s_Science | degree_t_Others | degree_t_Sci&Tech | workex_Yes | specialisation_Mkt&HR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 67.00 | 91.00 | 58.00 | 55.0 | 58.80 | 270000.0 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 1 |
| 1 | 2 | 79.33 | 78.33 | 77.48 | 86.5 | 66.28 | 200000.0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 |
| 2 | 3 | 65.00 | 68.00 | 64.00 | 75.0 | 57.80 | 250000.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 4 | 56.00 | 52.00 | 52.00 | 66.0 | 59.43 | 265000.0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
| 4 | 5 | 85.80 | 73.60 | 73.30 | 96.8 | 55.50 | 425000.0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 210 | 211 | 80.60 | 82.00 | 77.60 | 91.0 | 74.49 | 400000.0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
| 211 | 212 | 58.00 | 60.00 | 72.00 | 74.0 | 53.62 | 275000.0 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| 212 | 213 | 67.00 | 67.00 | 73.00 | 59.0 | 69.72 | 295000.0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 |
| 213 | 214 | 74.00 | 66.00 | 58.00 | 70.0 | 60.23 | 204000.0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
| 214 | 215 | 62.00 | 58.00 | 53.00 | 89.0 | 60.22 | 265000.0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
215 rows × 16 columns
# NOTE(review): these two lists are defined but never used below — the scaler
# is applied to every column of the dummy-encoded X, including the 0/1
# indicator columns, which is usually unnecessary.
numeric_features = ['ssc_p', 'hsc_p', 'mba_p', 'degree_p','etest_p','salary']
categorical_features = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex','specialisation']
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Standardise to zero mean / unit variance; fit_transform returns a NumPy
# array, so column names are lost from this point on.
# NOTE(review): fitting on the full dataset (before the split below) leaks
# test-set statistics into training — fit on X_train only. TODO confirm.
X= scaler.fit_transform(X)
X
array([[-1.72401341, -0.02808697, 2.2688123 , ..., 1.62605898,
-0.72444647, 1.12390297],
[-1.70790113, 1.11336869, 1.10344799, ..., 1.62605898,
1.38036423, -0.88975652],
[-1.69178886, -0.21323793, 0.15331275, ..., -0.61498384,
-0.72444647, -0.88975652],
...,
[ 1.69178886, -0.02808697, 0.06133451, ..., -0.61498384,
1.38036423, -0.88975652],
[ 1.70790113, 0.61994138, -0.03064373, ..., -0.61498384,
-0.72444647, 1.12390297],
[ 1.72401341, -0.49096436, -0.76646966, ..., -0.61498384,
-0.72444647, 1.12390297]])
from sklearn.model_selection import train_test_split
# 80/20 stratified split so both classes keep their proportions; seed pinned
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1,stratify=y)
#Pipeline(steps=[('name_of_preprocessor', preprocessor),
#               ('name_of_ml_model', ml_model())])
# Sanity-check the split sizes: 172 train / 43 test of 215 rows
X_train.shape
(172, 16)
# Held-out test split (20% of 215 rows)
X_test.shape
(43, 16)
# Target rows matching X_train
y_train.shape
(172,)
# Target rows matching X_test
y_test.shape
(43,)
#Preprocessor
# Peek at the scaled train/test arrays (standardised feature matrices)
X_train
X_test
array([[ 1.62733976e+00, -2.24989846e+00, -5.82513175e-01,
-1.86632911e-01, 1.55223634e+00, 1.10281049e-01,
-2.08630946e-01, -1.35238581e+00, -9.23822643e-01,
8.00762995e-01, -1.05254128e+00, 1.16732059e+00,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
1.12390297e+00],
[-4.35031420e-01, 1.57063988e-01, -3.98556692e-01,
3.58206709e-01, -3.85091762e-01, 1.23395855e-02,
-5.28936955e-01, -1.35238581e+00, 1.08245885e+00,
8.00762995e-01, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, 1.38036423e+00,
1.12390297e+00],
[ 7.89501466e-01, 1.56421127e+00, 2.82082292e-01,
-1.86632911e-01, 5.96407276e-01, -1.64235567e+00,
-3.36753350e-01, 7.39433966e-01, -9.23822643e-01,
-1.24880896e+00, -1.05254128e+00, 1.16732059e+00,
-2.32210182e-01, 1.62605898e+00, 1.38036423e+00,
1.12390297e+00],
[ 1.17619606e+00, -2.13237928e-01, 4.75236599e-01,
-4.86294703e-01, -1.14009102e+00, -9.75666407e-01,
-2.08630946e-01, 7.39433966e-01, -9.23822643e-01,
8.00762995e-01, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, 1.38036423e+00,
-8.89756521e-01],
[-1.64345203e+00, -1.13899272e+00, -1.52069123e+00,
1.19839375e-01, -1.29109087e+00, -1.83823859e+00,
-2.08630946e-01, 7.39433966e-01, 1.08245885e+00,
8.00762995e-01, -1.05254128e+00, 1.16732059e+00,
-2.32210182e-01, 1.62605898e+00, 1.38036423e+00,
-8.89756521e-01],
[ 1.93347298e-01, -6.02054939e-01, 1.90104051e-01,
-2.41116874e-01, 7.97237079e-01, 3.54275572e-01,
8.43963129e+00, 7.39433966e-01, -9.23822643e-01,
-1.24880896e+00, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, 1.38036423e+00,
-8.89756521e-01],
[ 1.41788018e+00, -1.20662449e-01, 8.89138684e-01,
7.66836425e-01, 8.98406980e-01, -5.71872653e-01,
-8.05085429e-02, 7.39433966e-01, -9.23822643e-01,
-1.24880896e+00, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, 1.38036423e+00,
1.12390297e+00],
[ 1.61122748e-02, -5.83539843e-01, 1.44100813e+00,
3.58206709e-01, 8.98406980e-01, -6.81842016e-01,
2.39797466e-01, 7.39433966e-01, -9.23822643e-01,
-1.24880896e+00, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
-8.89756521e-01],
[-1.65956431e+00, 1.71233203e+00, 6.68390905e-01,
9.43909302e-01, 1.86480603e+00, -1.16467625e+00,
1.84132751e+00, 7.39433966e-01, -9.23822643e-01,
-1.24880896e+00, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
-8.89756521e-01],
[-9.82848764e-01, 3.32031643e-01, 5.19386155e-01,
-1.04906968e-01, -3.09591836e-01, 1.60689534e+00,
-2.08630946e-01, -1.35238581e+00, 1.08245885e+00,
8.00762995e-01, -1.05254128e+00, 1.16732059e+00,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
1.12390297e+00],
[-1.17619606e+00, -4.90964364e-01, -1.41031735e+00,
-1.95736168e+00, -2.76371869e-01, 8.45069797e-02,
-2.08630946e-01, 7.39433966e-01, 1.08245885e+00,
8.00762995e-01, -1.05254128e+00, 1.16732059e+00,
4.30644338e+00, -6.14983844e-01, -7.24446475e-01,
1.12390297e+00],
[ 1.04729786e+00, 5.27365904e-01, -7.66469657e-01,
-1.41252206e+00, 8.98406980e-01, -1.65610184e+00,
2.39797466e-01, 7.39433966e-01, 1.08245885e+00,
8.00762995e-01, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
1.12390297e+00],
[-6.12266443e-01, 5.27365904e-01, 6.13203961e-01,
-5.04230063e-02, -1.58591984e-01, 9.95190764e-01,
-8.05085429e-02, 7.39433966e-01, -9.23822643e-01,
-1.24880896e+00, -1.05254128e+00, 1.16732059e+00,
-2.32210182e-01, 1.62605898e+00, 1.38036423e+00,
-8.89756521e-01],
[-1.25675744e+00, -4.90964364e-01, 6.13345137e-02,
-1.14010225e+00, 3.69907498e-01, -1.88806846e+00,
-2.08630946e-01, 7.39433966e-01, -9.23822643e-01,
-1.24880896e+00, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
-8.89756521e-01],
[ 8.70062840e-01, -1.08344743e+00, -1.41031735e+00,
-1.20820720e+00, -1.09252607e+00, 7.59156232e-02,
-2.08630946e-01, 7.39433966e-01, 1.08245885e+00,
8.00762995e-01, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
1.12390297e+00],
[ 3.70582321e-01, -4.90964364e-01, -1.22621969e-01,
-8.67682437e-01, 8.98406980e-01, 3.21628418e-01,
-2.08630946e-01, 7.39433966e-01, -9.23822643e-01,
8.00762995e-01, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
-8.89756521e-01],
[ 1.48232928e+00, 1.57063988e-01, -1.22621969e-01,
-1.27631215e+00, 6.79077939e-02, -1.11312811e+00,
-2.08630946e-01, 7.39433966e-01, 1.08245885e+00,
8.00762995e-01, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
1.12390297e+00],
[ 6.12266443e-01, 2.04560376e+00, -6.19163295e-02,
6.64678996e-01, -7.59213209e-03, 1.63547459e-01,
1.52102150e+00, 7.39433966e-01, 1.08245885e+00,
8.00762995e-01, -1.05254128e+00, 1.16732059e+00,
-2.32210182e-01, 1.62605898e+00, -7.24446475e-01,
1.12390297e+00],
[-1.43399246e+00, -3.98388886e-01, -3.06437275e-02,
-3.22842817e-01, -3.09591836e-01, 3.09600519e-01,
-2.08630946e-01, -1.35238581e+00, -9.23822643e-01,
-1.24880896e+00, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
1.12390297e+00],
[-1.11174696e+00, 5.27365904e-01, -7.66469657e-01,
-5.04230063e-02, -1.38924078e+00, -9.30991353e-01,
-5.28936955e-01, -1.35238581e+00, 1.08245885e+00,
8.00762995e-01, -1.05254128e+00, 1.16732059e+00,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
1.12390297e+00],
[ 5.15592794e-01, 8.97667820e-01, 3.37269237e-01,
-1.00389234e+00, -1.06459110e+00, -1.34853128e+00,
-7.85181762e-01, 7.39433966e-01, -9.23822643e-01,
-1.24880896e+00, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, 1.38036423e+00,
-8.89756521e-01],
[-1.19230834e+00, 1.82342261e+00, -1.22621969e-01,
1.99272557e+00, 1.20040668e+00, 1.80449654e+00,
-2.72692148e-01, -1.35238581e+00, 1.08245885e+00,
8.00762995e-01, -1.05254128e+00, 1.16732059e+00,
-2.32210182e-01, -6.14983844e-01, 1.38036423e+00,
-8.89756521e-01],
[-1.49844156e+00, -4.90964364e-01, -1.77823031e+00,
-2.22978149e+00, 2.94407572e-01, -1.25746290e+00,
-2.08630946e-01, 7.39433966e-01, -9.23822643e-01,
-1.24880896e+00, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
1.12390297e+00],
[-1.46621701e+00, -3.98388886e-01, -1.22480793e-02,
-1.04906968e-01, -9.13591244e-01, 4.49867400e-02,
2.39797466e-01, 7.39433966e-01, -9.23822643e-01,
-1.24880896e+00, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, 1.38036423e+00,
-8.89756521e-01],
[-1.14397151e+00, -1.50929463e+00, -2.05416503e+00,
-1.27631215e+00, -6.11591540e-01, -1.86057612e+00,
-2.08630946e-01, 7.39433966e-01, -9.23822643e-01,
-1.24880896e+00, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
-8.89756521e-01],
[-1.09563469e+00, 1.26796974e+00, 1.53312755e-01,
-3.22842817e-01, 1.57790631e+00, 4.84232826e-02,
1.66195615e+00, 7.39433966e-01, 1.08245885e+00,
8.00762995e-01, -1.05254128e+00, 1.16732059e+00,
-2.32210182e-01, 1.62605898e+00, -7.24446475e-01,
-8.89756521e-01],
[ 9.66736489e-02, 6.16238364e-01, 1.16507341e+00,
8.57868989e-02, 3.73285680e-03, 1.37773390e-01,
-1.64473411e-02, -1.35238581e+00, 1.08245885e+00,
8.00762995e-01, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
-8.89756521e-01],
[ 2.90020947e-01, 1.54569617e+00, 6.13203961e-01,
9.03046330e-01, 2.18907646e-01, 1.89900146e+00,
8.80409484e-01, -1.35238581e+00, -9.23822643e-01,
-1.24880896e+00, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
-8.89756521e-01],
[-9.98961039e-01, 8.05092341e-01, -2.14600210e-01,
7.66836425e-01, -1.06459110e+00, 7.30576985e-01,
-2.08630946e-01, -1.35238581e+00, -9.23822643e-01,
-1.24880896e+00, -1.05254128e+00, 1.16732059e+00,
-2.32210182e-01, 1.62605898e+00, -7.24446475e-01,
1.12390297e+00],
[-1.72401341e+00, -2.80869697e-02, 2.26881230e+00,
-1.14010225e+00, -1.29109087e+00, -5.97646723e-01,
-1.44569745e-01, 7.39433966e-01, 1.08245885e+00,
8.00762995e-01, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, 1.62605898e+00, -7.24446475e-01,
1.12390297e+00],
[ 1.14397151e+00, 6.44885092e-02, -9.50426139e-01,
2.21996804e-01, 6.79077939e-02, 9.95190764e-01,
8.80409484e-01, 7.39433966e-01, 1.08245885e+00,
8.00762995e-01, -1.05254128e+00, 1.16732059e+00,
-2.32210182e-01, 1.62605898e+00, -7.24446475e-01,
1.12390297e+00],
[ 3.54470046e-01, 8.69895176e-01, 2.14924059e+00,
-5.04230063e-02, 1.35140654e+00, 1.07766779e+00,
-4.00814552e-01, 7.39433966e-01, -9.23822643e-01,
8.00762995e-01, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, 1.38036423e+00,
-8.89756521e-01],
[-9.18399665e-01, 7.31031958e-01, 6.31599609e-01,
2.76480766e-01, -5.36091614e-01, 1.20590677e-01,
-1.04142657e+00, -1.35238581e+00, -9.23822643e-01,
-1.24880896e+00, -1.05254128e+00, 1.16732059e+00,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
1.12390297e+00],
[ 1.57900293e+00, -5.83539843e-01, -3.98556692e-01,
-1.86632911e-01, -7.62591392e-01, -9.39582710e-01,
-4.00814552e-01, 7.39433966e-01, 1.08245885e+00,
8.00762995e-01, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
-8.89756521e-01],
[ 1.45010473e+00, 1.54199315e+00, -1.22636086e+00,
3.35482462e+00, -9.64931193e-01, 1.27698726e+00,
-2.72692148e-01, -1.35238581e+00, 1.08245885e+00,
8.00762995e-01, -1.05254128e+00, 1.16732059e+00,
-2.32210182e-01, 1.62605898e+00, -7.24446475e-01,
1.12390297e+00],
[-2.09459573e-01, -8.61266280e-01, -3.98556692e-01,
-3.22842817e-01, -1.37565079e+00, -1.25574463e+00,
-2.72692148e-01, 7.39433966e-01, -9.23822643e-01,
-1.24880896e+00, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
-8.89756521e-01],
[-1.16008379e+00, 1.57063988e-01, 1.07309517e+00,
7.66836425e-01, -8.30920581e-02, 7.93521658e-02,
2.39797466e-01, -1.35238581e+00, -9.23822643e-01,
-1.24880896e+00, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
1.12390297e+00],
[-1.67567658e+00, -1.04641724e+00, -1.31833910e+00,
-1.95736168e+00, -4.60591688e-01, -4.89395632e-01,
-2.08630946e-01, 7.39433966e-01, -9.23822643e-01,
-1.24880896e+00, -1.05254128e+00, 1.16732059e+00,
-2.32210182e-01, 1.62605898e+00, -7.24446475e-01,
1.12390297e+00],
[-3.86694596e-01, 1.54569617e+00, 1.16507341e+00,
2.21996804e-01, 8.98406980e-01, 7.58069326e-01,
2.39797466e-01, 7.39433966e-01, 1.08245885e+00,
8.00762995e-01, -1.05254128e+00, 1.16732059e+00,
-2.32210182e-01, 1.62605898e+00, 1.38036423e+00,
-8.89756521e-01],
[ 1.70790113e+00, 6.19941383e-01, -3.06437275e-02,
-1.14010225e+00, -1.58591984e-01, -3.51933928e-01,
-9.90177608e-01, -1.35238581e+00, 1.08245885e+00,
8.00762995e-01, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
1.12390297e+00],
[-1.45010473e+00, -1.13899272e+00, 6.13345137e-02,
-3.22842817e-01, -9.13591244e-01, 8.59447332e-01,
-2.08630946e-01, -1.35238581e+00, -9.23822643e-01,
-1.24880896e+00, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, -7.24446475e-01,
-8.89756521e-01],
[-1.48232928e+00, -2.13237928e-01, 7.97160443e-01,
3.58206709e-01, -7.59213209e-03, 4.09260254e-01,
-1.04142657e+00, -1.35238581e+00, -9.23822643e-01,
-1.24880896e+00, 9.50081506e-01, -8.56662692e-01,
-2.32210182e-01, -6.14983844e-01, 1.38036423e+00,
-8.89756521e-01],
[ 2.73908672e-01, -2.80869697e-02, 4.29247478e-01,
-2.77893548e-01, -6.11591540e-01, -1.74951985e-01,
-4.00814552e-01, 7.39433966e-01, -9.23822643e-01,
-1.24880896e+00, -1.05254128e+00, 1.16732059e+00,
4.30644338e+00, -6.14983844e-01, 1.38036423e+00,
1.12390297e+00]])
#https://medium.com/analytics-vidhya/evaluating-a-random-forest-model-9d165595ad56
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Instantiate and fit the RandomForestClassifier.
# random_state pinned so results are reproducible across runs (matching the
# random_state=1 used in train_test_split above); default unseeded forests
# give slightly different accuracy each run.
forest = RandomForestClassifier(random_state=1)
forest.fit(X_train, y_train)
RandomForestClassifier()
# Make predictions for the test set
y_pred_test = forest.predict(X_test)
# View accuracy score (fraction of the 43 held-out rows classified correctly)
accuracy_score(y_test, y_pred_test)
0.9534883720930233
# View confusion matrix for test data and predictions
# (rows = true class, columns = predicted; 0 = Not Placed, 1 = Placed)
confusion_matrix(y_test, y_pred_test)
array([[11, 2],
[ 0, 30]], dtype=int64)
# Get and reshape confusion matrix data (row-normalised so each true-class
# row sums to 1)
matrix = confusion_matrix(y_test, y_pred_test)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]
# Build the plot
plt.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2)
# Add labels to the plot.
# BUG FIX: LabelEncoder sorts labels alphabetically, so 0 = 'Not Placed' and
# 1 = 'Placed' (confusion-matrix row 0 has support 13, the Not Placed count).
# The original listed ['placed','not_placed'], swapping the axis labels.
class_names = ['not_placed','placed']
tick_marks = np.arange(len(class_names))
tick_marks2 = tick_marks + 0.5
plt.xticks(tick_marks, class_names, rotation=25)
plt.yticks(tick_marks2, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for Random Forest Model')
plt.show()
# View the classification report for test data and predictions
# (precision/recall/F1 per class; support = rows per class in the test set)
print(classification_report(y_test, y_pred_test))
precision recall f1-score support
0 1.00 0.85 0.92 13
1 0.94 1.00 0.97 30
accuracy 0.95 43
macro avg 0.97 0.92 0.94 43
weighted avg 0.96 0.95 0.95 43
from sklearn.linear_model import LogisticRegression
#https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8
import statsmodels.api as sm
# NOTE(review): sm.Logit does not add an intercept automatically; the summary
# below is therefore a no-intercept fit. Consider sm.add_constant(X) — verify
# against the intended model.
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())
C:\Users\lance\anaconda3\lib\site-packages\statsmodels\compat\pandas.py:61: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. from pandas import Int64Index as NumericIndex
Optimization terminated successfully.
Current function value: 0.359486
Iterations 7
Results: Logit
=================================================================
Model: Logit Pseudo R-squared: 0.421
Dependent Variable: y AIC: 186.5789
Date: 2022-04-20 16:32 BIC: 240.5091
No. Observations: 215 Log-Likelihood: -77.289
Df Model: 15 LL-Null: -133.39
Df Residuals: 199 LLR p-value: 6.1118e-17
Converged: 1.0000 Scale: 1.0000
No. Iterations: 7.0000
-------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
-------------------------------------------------------------------
x1 0.3047 0.2134 1.4283 0.1532 -0.1134 0.7229
x2 1.7088 0.3696 4.6233 0.0000 0.9844 2.4331
x3 1.0688 0.3421 3.1240 0.0018 0.3982 1.7393
x4 1.0069 0.3021 3.3327 0.0009 0.4147 1.5990
x5 -0.4167 0.2312 -1.8021 0.0715 -0.8699 0.0365
x6 -1.1309 0.2821 -4.0082 0.0001 -1.6838 -0.5779
x7 0.3161 0.2310 1.3687 0.1711 -0.1366 0.7688
x8 0.2620 0.2405 1.0892 0.2761 -0.2094 0.7334
x9 0.1692 0.3005 0.5632 0.5733 -0.4197 0.7582
x10 -0.0474 0.2853 -0.1662 0.8680 -0.6066 0.5118
x11 -0.8725 0.5104 -1.7093 0.0874 -1.8729 0.1279
x12 -0.5243 0.5506 -0.9521 0.3410 -1.6035 0.5550
x13 -0.3586 0.2565 -1.3981 0.1621 -0.8613 0.1441
x14 -0.5951 0.2973 -2.0017 0.0453 -1.1779 -0.0124
x15 0.5814 0.2317 2.5098 0.0121 0.1274 1.0355
x16 -0.1698 0.2211 -0.7679 0.4426 -0.6032 0.2636
=================================================================
#https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# NOTE(review): this re-splits with a different seed/test size than the
# random-forest section (0.3 / seed 0, no stratify), so the two models'
# metrics are not computed on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
LogisticRegression()
# Predict on the 65-row test split and report mean accuracy
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
Accuracy of logistic regression classifier on test set: 0.85
from sklearn.metrics import confusion_matrix
# BUG FIX: the original rebound the name `confusion_matrix` to the result
# array, shadowing the imported function — any later call to
# confusion_matrix(...) would raise TypeError. Use a distinct variable.
cm = confusion_matrix(y_test, y_pred)
print(cm)
[[12 7] [ 3 43]]
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 for the logistic-regression test predictions
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.80 0.63 0.71 19
1 0.86 0.93 0.90 46
accuracy 0.85 65
macro avg 0.83 0.78 0.80 65
weighted avg 0.84 0.85 0.84 65
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# BUG FIX: ROC-AUC must be computed from predicted probabilities/scores, not
# hard class labels — the original used .predict(), which collapses the ROC
# to a single operating point and understates the area. The curve below was
# already drawn from predict_proba; make the AUC consistent with it.
logit_roc_auc = roc_auc_score(y_test, logreg.predict_proba(X_test)[:,1])
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal = no-skill reference line
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
XGBOOST
#https://towardsdatascience.com/beginners-guide-to-xgboost-for-classification-problems-50f75aac5390
# pip (IPython shell magic — only valid inside a Jupyter/IPython cell)
!pip install xgboost
# conda alternative:
#conda install -c conda-forge xgboost
Requirement already satisfied: xgboost in c:\users\lance\anaconda3\lib\site-packages (1.6.0) Requirement already satisfied: numpy in c:\users\lance\anaconda3\lib\site-packages (from xgboost) (1.21.5) Requirement already satisfied: scipy in c:\users\lance\anaconda3\lib\site-packages (from xgboost) (1.7.3)
WARNING: Ignoring invalid distribution -illow (c:\users\lance\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -illow (c:\users\lance\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -illow (c:\users\lance\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -illow (c:\users\lance\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -illow (c:\users\lance\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -illow (c:\users\lance\anaconda3\lib\site-packages) WARNING: You are using pip version 21.2.4; however, version 22.0.4 is available. You should consider upgrading via the 'C:\Users\lance\anaconda3\python.exe -m pip install --upgrade pip' command.
# Same 70/30 split (seed 0) as the logistic-regression section
X_train_xg, X_test_xg, y_train_xg, y_test_xg= train_test_split(X, y, test_size=0.3, random_state=0)
import xgboost as xgb
# NOTE(review): this instance is re-created before fitting in the next cell;
# this first construction only feeds the type() check below.
xgb_cl = xgb.XGBClassifier()
print(type(xgb_cl))
<class 'xgboost.sklearn.XGBClassifier'>
from sklearn.metrics import accuracy_score
# Init classifier. use_label_encoder=False plus an explicit eval_metric
# silences the XGBoost >= 1.3 deprecation warnings seen in the output —
# y is already integer-encoded 0/1, so the internal label encoder is unused
# and predictions are unchanged.
xgb_cl = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# Fit
xgb_cl.fit(X_train_xg, y_train_xg)
# Predict
preds_xg = xgb_cl.predict(X_test_xg)
# Score
accuracy_score(y_test_xg, preds_xg)
[17:01:18] WARNING: ..\src\learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
C:\Users\lance\anaconda3\lib\site-packages\xgboost\sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
0.9230769230769231
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 for the XGBoost test predictions
print(classification_report(y_test_xg, preds_xg))
precision recall f1-score support
0 0.82 0.95 0.88 19
1 0.98 0.91 0.94 46
accuracy 0.92 65
macro avg 0.90 0.93 0.91 65
weighted avg 0.93 0.92 0.92 65